NOTE: This analysis requires at least 10Gb of RAM to run.
input_folder <- "raw_input"
output_folder <- "results"
output_format <- ".pdf"
size_range <- c(0.0004, 0.015)
label_size_range <- c(0.001, 0.02)
all_size_interval <- c(1, 3000000)
pcr_size_interval <- c(1, 25000)
label_max <- 100
max_taxonomy_depth <- 4
min_seq_count <- NULL
just_bacteria <- TRUE
max_mismatch <- 10 # percentage mismatch tolerated in pcr
pcr_success_cutoff <- 0.90 # Used to subset for graphing
min_seq_length <- 1200 # Use to encourage full length sequences
forward_primer = c("515F" = "GTGYCAGCMGCCGCGGTAA")
reverse_primer = c("806R" = "GGACTACNVGGGTWTCTAAT")
pcr_success_color_scale = c("red", "orange", "yellow", "green", "cyan")
The code below parses and subsets the entire SILVA non-redundant reference database. The object made is quite large.
library(metacoder)
file_path <- file.path(input_folder, "SILVA_123.1_SSURef_Nr99_tax_silva.fasta")
system.time(silva <- extract_taxonomy(seqinr::read.fasta(file_path, as.string = TRUE),
regex = "^(.*?) (.*)$",
key = c(id = "obs_info", "class"),
class_sep = ";"))
## user system elapsed
## 627.938 3.735 632.680
print(silva)
## `taxmap` object with data for 147708 taxa and 598470 observations:
##
## ----------------------------------------- taxa -----------------------------------------
## 1, 2, 3, 4, 5, 6, 7 ... 147702, 147703, 147704, 147705, 147706, 147707, 147708
##
## -------------------------------------- taxon_data --------------------------------------
## # A tibble: 147,708 x 3
## taxon_ids supertaxon_ids name
## <chr> <chr> <chr>
## 1 1 <NA> Archaea
## 2 2 <NA> Bacteria
## 3 3 <NA> Eukaryota
## 4 4 1 Aenigmarchaeota
## 5 5 1 Aigarchaeota
## 6 6 1 Ancient Archaeal Group(AAG)
## 7 7 1 Crenarchaeota
## # ... with 1.477e+05 more rows
##
## --------------------------------------- obs_data ---------------------------------------
## # A tibble: 598,470 x 3
## obs_taxon_ids id
## <chr> <chr>
## 1 143243 >AB001438.1.1662
## 2 111186 >AB006051.1.1796
## 3 116163 >FJ911802.1.1606
## 4 115769 >FJ911814.1.1608
## 5 115905 >FJ911819.1.1607
## 6 115859 >FJ911835.1.1585
## 7 115926 >FJ911842.1.1603
## # ... with 5.985e+05 more rows, and 1 more variables: sequence <chr>
##
## ------------------------------------- taxon_funcs -------------------------------------
## n_obs, n_obs_1, n_supertaxa, n_subtaxa, n_subtaxa_1, hierarchies
if (! is.null(min_seq_count)) {
system.time(silva <- filter_taxa(silva, n_obs >= min_seq_count))
}
if (just_bacteria) {
system.time(silva <- filter_taxa(silva, name == "Bacteria", subtaxa = TRUE))
}
## user system elapsed
## 238.321 1.874 240.604
if (! is.null(max_taxonomy_depth)) {
system.time(silva <- filter_taxa(silva, n_supertaxa <= max_taxonomy_depth))
}
## user system elapsed
## 25.588 0.000 25.611
print(silva)
## `taxmap` object with data for 5006 taxa and 513121 observations:
##
## ----------------------------------------- taxa -----------------------------------------
## 2, 3423, 3491, 3492, 3493 ... 104672, 104673, 104674, 104675, 104676, 104677
##
## -------------------------------------- taxon_data --------------------------------------
## # A tibble: 5,006 x 3
## taxon_ids supertaxon_ids name
## <chr> <chr> <chr>
## 1 2 <NA> Bacteria
## 2 3423 2 Acetothermia
## 3 3491 3423 bacterium SCGC AAA255-C06
## 4 3492 3423 clone OPB14
## 5 3493 3423 bacterium enrichment culture clone 73(2013)
## 6 3494 3423 bacterium enrichment culture clone AOM-SR-B11
## 7 3495 3423 hypersaline lake metagenome
## # ... with 4,999 more rows
##
## --------------------------------------- obs_data ---------------------------------------
## # A tibble: 513,121 x 3
## obs_taxon_ids id
## <chr> <chr>
## 1 91658 >AY230195.1.1496
## 2 36076 >AY230764.1.1512
## 3 63763 >AY230816.1.1447
## 4 15361 >AY232254.1.1432
## 5 3543 >AY234438.1.1303
## 6 22606 >AY234499.1.1317
## 7 54118 >AY234528.1.1429
## # ... with 5.131e+05 more rows, and 1 more variables: sequence <chr>
##
## ------------------------------------- taxon_funcs -------------------------------------
## n_obs, n_obs_1, n_supertaxa, n_subtaxa, n_subtaxa_1, hierarchies
These are not bacterial and will bias the in silico PCR results.
system.time(silva <- filter_taxa(silva, name == "Chloroplast", subtaxa = TRUE, invert = TRUE))
## user system elapsed
## 0.297 0.000 0.298
print(silva)
## `taxmap` object with data for 3843 taxa and 513121 observations:
##
## ----------------------------------------- taxa -----------------------------------------
## 2, 3423, 3491, 3492, 3493 ... 104672, 104673, 104674, 104675, 104676, 104677
##
## -------------------------------------- taxon_data --------------------------------------
## # A tibble: 3,843 x 3
## taxon_ids supertaxon_ids name
## <chr> <chr> <chr>
## 1 2 <NA> Bacteria
## 2 3423 2 Acetothermia
## 3 3491 3423 bacterium SCGC AAA255-C06
## 4 3492 3423 clone OPB14
## 5 3493 3423 bacterium enrichment culture clone 73(2013)
## 6 3494 3423 bacterium enrichment culture clone AOM-SR-B11
## 7 3495 3423 hypersaline lake metagenome
## # ... with 3,836 more rows
##
## --------------------------------------- obs_data ---------------------------------------
## # A tibble: 513,121 x 3
## obs_taxon_ids id
## <chr> <chr>
## 1 91658 >AY230195.1.1496
## 2 36076 >AY230764.1.1512
## 3 63763 >AY230816.1.1447
## 4 15361 >AY232254.1.1432
## 5 3543 >AY234438.1.1303
## 6 22606 >AY234499.1.1317
## 7 54118 >AY234528.1.1429
## # ... with 5.131e+05 more rows, and 1 more variables: sequence <chr>
##
## ------------------------------------- taxon_funcs -------------------------------------
## n_obs, n_obs_1, n_supertaxa, n_subtaxa, n_subtaxa_1, hierarchies
Although SILVA is such a large database ( taxa) that graphing everything can be a bit overwhelming, it gives an intuitive feel for the complexity of the database:
system.time(silva_plot_all <- heat_tree(silva,
node_size = n_obs,
node_color = n_obs,
node_size_range = size_range * 2,
edge_size_range = size_range,
node_size_interval = all_size_interval,
edge_size_interval = all_size_interval,
node_color_interval = all_size_interval,
edge_color_interval = all_size_interval,
node_label = name,
node_label_size_range = label_size_range,
node_label_max = label_max,
node_color_axis_label = "Sequence count",
make_legend = TRUE,
output_file = file.path(output_folder, paste0("silva--all", output_format))))
## user system elapsed
## 40.925 0.096 41.078
print(silva_plot_all)
if (! is.null(min_seq_length)) {
system.time(silva <- filter_obs(silva, nchar(sequence) >= min_seq_length, unobserved = FALSE))
}
## user system elapsed
## 2.310 0.008 2.320
print(silva)
## `taxmap` object with data for 3842 taxa and 513120 observations:
##
## ----------------------------------------- taxa -----------------------------------------
## 2, 3423, 3491, 3492, 3493 ... 104672, 104673, 104674, 104675, 104676, 104677
##
## -------------------------------------- taxon_data --------------------------------------
## # A tibble: 3,842 x 3
## taxon_ids supertaxon_ids name
## <chr> <chr> <chr>
## 1 2 <NA> Bacteria
## 2 3423 2 Acetothermia
## 3 3491 3423 bacterium SCGC AAA255-C06
## 4 3492 3423 clone OPB14
## 5 3493 3423 bacterium enrichment culture clone 73(2013)
## 6 3494 3423 bacterium enrichment culture clone AOM-SR-B11
## 7 3495 3423 hypersaline lake metagenome
## # ... with 3,835 more rows
##
## --------------------------------------- obs_data ---------------------------------------
## # A tibble: 513,120 x 3
## obs_taxon_ids id
## <chr> <chr>
## 1 91658 >AY230195.1.1496
## 2 36076 >AY230764.1.1512
## 3 63763 >AY230816.1.1447
## 4 15361 >AY232254.1.1432
## 5 3543 >AY234438.1.1303
## 6 22606 >AY234499.1.1317
## 7 54118 >AY234528.1.1429
## # ... with 5.131e+05 more rows, and 1 more variables: sequence <chr>
##
## ------------------------------------- taxon_funcs -------------------------------------
## n_obs, n_obs_1, n_supertaxa, n_subtaxa, n_subtaxa_1, hierarchies
# Replace all u with t so in silico PCR works
system.time(silva$obs_data$sequence <- gsub(pattern = "u", replacement = "t", silva$obs_data$sequence))
## user system elapsed
## 29.697 0.000 29.726
# in silico PCR
system.time(silva_pcr <- primersearch(silva,
forward = forward_primer,
reverse = reverse_primer,
mismatch = max_mismatch))
## user system elapsed
## 52.802 0.749 53.770
system.time(silva_plot_pcr_all <- heat_tree(silva_pcr,
node_size = n_obs,
node_label = name,
node_color = prop_amplified,
node_color_range = pcr_success_color_scale,
node_color_trans = "linear",
edge_color_interval = c(0, 1),
node_color_interval = c(0, 1),
node_label_size_range = label_size_range,
node_label_max = label_max,
node_color_axis_label = "Proportion PCR success",
node_size_axis_label = "Sequence count",
output_file = file.path(output_folder, paste0("silva--pcr_all", output_format))))
## user system elapsed
## 55.059 0.340 55.523
print(silva_plot_pcr_all)
system.time(silva_plot_pcr_fail <- silva_pcr %>%
filter_taxa(prop_amplified < pcr_success_cutoff, supertaxa = TRUE) %>%
heat_tree(node_size = n_obs - count_amplified,
node_label = name,
node_color = prop_amplified,
node_size_range = size_range * 2,
edge_size_range = size_range,
node_size_interval = pcr_size_interval,
edge_size_interval = pcr_size_interval,
node_color_range = pcr_success_color_scale,
node_color_trans = "linear",
node_color_interval = c(0, 1),
edge_color_interval = c(0, 1),
node_label_size_range = label_size_range,
node_label_max = 1000,
node_color_axis_label = "Proportion PCR success",
node_size_axis_label = "Sequences not amplified",
make_legend = TRUE,
output_file = file.path(output_folder, paste0("silva--pcr_fail", output_format))))
## user system elapsed
## 45.840 0.008 45.901
print(silva_plot_pcr_fail)
Some results from this file will be combined with other to make a composite figure. Below, the needed objects are saved so that they can be loaded by another Rmd file.
save(file = file.path(output_folder, "silva_data.RData"),
silva_plot_all, silva_plot_pcr_fail)
Comments